#install.packages("knitr")
#install.packages("grid")
install.packages("plotly")
Installing package into 㤼㸱C:/Users/ASUS/Documents/R/win-library/4.0㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/plotly_4.9.3.zip'
Content type 'application/zip' length 3118743 bytes (3.0 MB)
downloaded 3.0 MB
package ‘plotly’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\ASUS\AppData\Local\Temp\Rtmp0kNO3m\downloaded_packages
#install.packages("dprep")
#install.packages("normalr")
#install.packages("ggcorrplot")

#install.packages("RColorBrewer")
#install.packages("rgdal")
#install.packages("jsonlite")
#install.packages("readr")
#install.packages("readr")


library(gridExtra)
library(dplyr)
library(lubridate)
library(magrittr)
library(ggplot2)
library(tidyr)
library(knitr)
library(normalr)
library(ggcorrplot)

library(leaflet)
library(plotly)
package 㤼㸱plotly㤼㸲 was built under R version 4.0.3Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: 㤼㸱plotly㤼㸲

The following object is masked from 㤼㸱package:ggplot2㤼㸲:

    last_plot

The following object is masked from 㤼㸱package:stats㤼㸲:

    filter

The following object is masked from 㤼㸱package:graphics㤼㸲:

    layout
library(RColorBrewer)
library(readr)
#library(MLRMPA)
#??src_mysql
my_db <- src_mysql(
  dbname = "coronavirus",
  host = "localhost",
  user = "root",
  password = "1234"
)
my_db
src:  mysql 8.0.21 [root@localhost:/coronavirus]
tbls: avg_world_temp_2020, covid_thai, covid_thailand, covid_us, covid19_confirmed, covid19_deaths, covid19_recovered, covidus,
  data_congestion, data_distance, data_ethnic, data_gender, data_lockdown, data_population, data_state_abr, gdp, gdp19, healthranking,
  population, pornhub, sars, sars_2003, sars_2003_update, top20pornhub
##import data
df_conf <- tbl(my_db, sql("select * from covid19_confirmed "))
df_conf <- as.data.frame(df_conf)
df_conf
df_deaths <- tbl(my_db, sql("select * from covid19_deaths "))
df_deaths <- as.data.frame(df_deaths)
df_deaths
df_recover <- tbl(my_db, sql("select * from covid19_recovered "))
df_recover <- as.data.frame(df_recover)
df_recover
##check the time frame of the data
n.col <- ncol(df_conf)
dates <- names(df_conf)[5:n.col]%>% mdy()
range(dates)
[1] "2020-01-22" "2021-01-14"
min.date <- min(dates)
max.date <- max(dates)
min.date.txt <- min.date %>% format('%d %b %Y')
max.date.txt <- max.date %>% format('%d %b %Y')
#clean data
cleanData <- function(data) {
  ## remove some columns
  data %<>% select(-c(Province.State, Lat, Long)) %>% rename(country=Country.Region)
  ## convert from wide to long format
  data %<>% gather(key=date, value=count, -country)
  ## convert from character to date
  data %<>% mutate(date = date %>% mdy())
  ## aggregate by country
  data %<>% group_by(country, date) %>% summarise(count=sum(count, na.rm=T)) %>% as.data.frame()
  return(data)
}
## clean the three data sets
data.confirmed <- df_conf %>% cleanData() %>% rename(confirmed=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data.deaths <- df_deaths %>% cleanData() %>% rename(deaths=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data.recovered <- df_recover %>% cleanData() %>% rename(recovered=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data <- data.confirmed %>% merge(data.deaths, all=T) %>% merge(data.recovered, all=T)
data
## countries/regions with confirmed cases, excl. cruise ships
countries <- data %>% pull(country) %>% setdiff('Cruise Ship')
data 
data.world <- data %>% group_by(date) %>%
  summarise(country='World',
            confirmed = sum(confirmed, na.rm=T),
            deaths = sum(deaths, na.rm=T),
            recovered = sum(recovered, na.rm=T))
`summarise()` ungrouping output (override with `.groups` argument)
data %<>% rbind(data.world)
data
data %<>% mutate(current.confirmed = confirmed - deaths - recovered)
data
NA
#rate
data %<>% arrange(country, date)
n <- nrow(data)
day1 <- min(data$date)
data %<>% mutate(new.confirmed = ifelse(date == day1, NA, confirmed - lag(confirmed, n=1)),
                 new.deaths = ifelse(date == day1, NA, deaths - lag(deaths, n=1)),
                 new.recovered = ifelse(date == day1, NA, recovered - lag(recovered, n=1)))
data %<>% mutate(new.confirmed = ifelse(new.confirmed < 0, 0, new.confirmed),
                 new.deaths = ifelse(new.deaths < 0, 0, new.deaths),
                 new.recovered = ifelse(new.recovered < 0, 0, new.recovered))
## death rate based on total deaths and recovered cases
data %<>% mutate(rate.upper = (100 * deaths / (deaths + recovered)) %>% round(1))
## lower bound: death rate based on total confirmed cases
data %<>% mutate(rate.lower = (100 * deaths / confirmed) %>% round(1))
## death rate based on the number of death/recovered on every single day
data %<>% mutate(rate.daily = (100 * new.deaths / (new.deaths + new.recovered)) %>% round(1))
View(data)
## convert from wide to long format
data.long <- data %>%
  select(c(country, date, confirmed, current.confirmed, recovered, deaths)) %>%
  gather(key=type, value=count, -c(country, date))
## set factor levels to show them in a desirable order
data.long %<>% mutate(type=recode_factor(type, confirmed='Total Confirmed',
                                         current.confirmed='Current Confirmed',
                                         recovered='Recovered',
                                         deaths='Deaths'))
View(data.long)
##Number of case World
world <- filter(data.long,country == 'World')
plot1 <- world %>% filter(type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count)) +
  geom_area(aes(fill=type), alpha=0.5) +
  labs(title=paste0('Numbers of Cases Worldwide - ', max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=7),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=6),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1))
plot2 <- world %>%
  ggplot(aes(x=date, y=count)) +
  geom_line(aes(color=type)) +
  labs(title=paste0('Numbers of Cases Worldwide (log scale) - ', max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=14),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=14),
        axis.text=element_text(size=14),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)

plot2

## Current Confirmed Cases
data.world <- data %>% filter(country=='World')
n <- nrow(data.world)
plot1 <- ggplot(data.world, aes(x=date, y=current.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Current Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=new.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Daily New Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)

View(data.world)
## a scatter plot with a smoothed line and vertical x-axis labels
plot1 <- ggplot(data.world, aes(x=date, y=deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot3 <- ggplot(data.world, aes(x=date, y=new.deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot4 <- ggplot(data.world, aes(x=date, y=new.recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show four plots together, with 2 plots in each row
grid.arrange(plot1, plot2, plot3, plot4, nrow=2)

## convert from wide to long format, for drawing area plots
rates.long <- data %>%
  select(c(country, date, rate.upper, rate.lower, rate.daily)) %>%
  gather(key=type, value=count, -c(country, date))
# set factor levels to show them in a desirable order
rates.long %<>% mutate(type=recode_factor(type, rate.daily='Daily',
                             rate.upper='Upper bound'))
## ranking by confirmed cases
data.latest.all <- data %>% filter(date == max(date)) %>%
  select(country, date,confirmed, new.confirmed, current.confirmed,
         recovered, deaths, new.deaths, death.rate=rate.lower) %>%
  mutate(ranking = dense_rank(desc(confirmed)))
#View(data.latest.all)
k <- 20
## top 20 countries: 21 incl. 'World'
top.countries <- data.latest.all %>% filter(ranking <= k + 1) %>%
  arrange(ranking) %>% pull(country) %>% as.character()
top.countries %>% setdiff('World') %>% print()
 [1] "US"             "India"          "Brazil"         "Russia"         "United Kingdom"
 [6] "France"         "Turkey"         "Italy"          "Spain"          "Germany"       
[11] "Colombia"       "Argentina"      "Mexico"         "Poland"         "Iran"          
[16] "South Africa"   "Ukraine"        "Peru"           "Netherlands"    "Indonesia"     
data.latest <- data.latest.all %>% filter(!is.na(country)) %>%
  mutate(country=ifelse(ranking <= k + 1, as.character(country), 'Others')) %>%
  mutate(country=country %>% factor(levels=c(top.countries, 'Others')))
data.latest %<>% group_by(country) %>%
  summarise(confirmed=sum(confirmed), new.confirmed=sum(new.confirmed),
            current.confirmed=sum(current.confirmed),
            recovered=sum(recovered), deaths=sum(deaths), new.deaths=sum(new.deaths)) %>%
  mutate(death.rate=(100 * deaths/confirmed) %>% round(1)) 
`summarise()` ungrouping output (override with `.groups` argument)
data.latest
data.latest %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths, current.confirmed,recovered)) %>%
  mutate(recover.rate=(100 * recovered/confirmed) %>% round(1))
data.latest
df_pop <- tbl(my_db, sql("select * from population "))
df_pop <- as.data.frame(df_pop)
df_pop <- rename(df_pop,"country"="Country")
df_pop
data.latest <- merge(x = data.latest, y = df_pop, by = "country", all.x = TRUE) 
data.latest <- rename(data.latest,"population" = "Population (2020)")
data.latest
data.latest  %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths,
                          current.confirmed,recovered,recover.rate,population)) %>%
  mutate(confirm.rate=(100 *confirmed/population) %>% round(1))
data.latest
NA
data.latest %>% mutate(death.rate=death.rate %>% format(nsmall=1) %>% paste0('%'))
NA
NA
## convert from wide to long format, for drawing area plots
data.latest.long <- data.latest %>% filter(country!='World') %>%
  gather(key=type, value=count, -country)
## set factor levels to show them with proper text and in a desirable order
data.latest.long %<>% mutate(type=recode_factor(type,
                                                confirmed='Total Confirmed',
                                                deaths='Total Deaths',
                                                death.rate='Death Rate (%)',
                                                new.confirmed='New Confirmed (compared with one day before)',
                                                new.deaths='New Deaths (compared with one day before)',
                                                current.confirmed='Current Confirmed',
                                                recover.rate = 'Recover Rate(%)',
                                                confirm.rate = 'Confirmed Rate(%)'))
#View(data.latest.long)
data.one.dem <- filter(data.latest.long,type=='Total Confirmed'
                       | type=='Total Deaths'
                       | type=='Current Confirmed')
data.two.dem <- filter(data.latest.long,type=='Death Rate (%)'
                       | type=='New Confirmed (compared with one day before)'
                       | type=='New Deaths (compared with one day before)'
                       | type=='Recover Rate(%)'
                       | type=='Confirmed Rate(%)')
data.two.dem
## bar chart
data.one.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')


data.two.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')

##GDP
df_gdp <- tbl(my_db, sql("select * from gdp"))
df_gdp <- as.data.frame(df_gdp)
df_gdp <- rename(df_gdp,"country"="Real GDP growth (Annual percent change)")
df_gdp <- select(df_gdp,c("country","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021"))
df_gdp
df_gdp2019 <- tbl(my_db, sql("select * from gdp19"))
df_gdp2019 <- as.data.frame(df_gdp2019)
df_gdp2019
NA
#healthranking
df_healt <- tbl(my_db, sql("select * from healthranking"))
df_healt <- as.data.frame(df_healt)
df_healt <- select(df_healt,c("country","healthCareIndex"))
df_healt
#Top20Pornhub
df_pornhub <- tbl(my_db, sql("select * from Pornhub"))
df_pornhub <- as.data.frame(df_pornhub)
df_pornhub
NA
#temp
df_temp <- tbl(my_db, sql("select * from Avg_World_Temp_2020"))
df_temp <- as.data.frame(df_temp)
df_city <- select(df_temp,c("Country","City")) %>%
  rename(country=Country) %>% 
  rename(city=City)
numofcity <- aggregate(city ~ country, data = df_city, length)
df_temp <- select(df_temp,c("Country","Apr","May","Jun","Jul","Aug")) %>%
  rename(country=Country)
df_temp <- data.frame(country=df_temp[,1],avg=rowMeans(df_temp[,-1]))
df_temp <- df_temp %<>% group_by(country) %>% summarise(avg_temp = mean(avg,na.rm = TRUE))
`summarise()` ungrouping output (override with `.groups` argument)
df_temp <- df_temp %>% mutate(country=ifelse(country=="United States","US", country ) ) 
df_temp$avg_temp <- df_temp$avg_temp %>% 
  sprintf(df_temp$avg_temp, fmt = '%#.1f') %>%
  as.numeric(df_temp$avg_temp)
df_temp
#Top 20 with gdp
data.longGDP <- df_gdp %>% gather(key=year, value=GDP, -c(country))
data.top <- data.latest %>% filter(country!='World')
data.top <- head(data.top,20)
#View(data.top)
data.gdp <- filter(data.longGDP,year=='2020')
#View(data.gdp)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "country", all.x = TRUE) 
  return(data)
}
data.top.world <- merge(x = data.top, y = df_gdp2019, by = "country", all.x = TRUE) %>% 
  select(-c(code,rank,new.confirmed,new.deaths,current.confirmed,population)) %>% 
  rename(GDP="GDP (millions of US dollars)")

data.top.world <- merge(x = data.top.world, y = df_healt, by = "country", all.x = TRUE) %>%
  rename(healthcare="healthCareIndex")
#data.top.world <- mergcountry(data.top.world, df_temp)

data.top.world <- merge(x = data.top.world, y = df_pornhub, by = "country", all.x = TRUE) %>%
  rename(Pornhub = "PornhubIndex(%)")

data.top.world <- mergcountry(data.top.world, df_temp)
index <- is.na(data.top.world)
data.top.world[index] <- 0
data.top.world
#View(data.top.world)

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
norm_data = as.data.frame(apply(data.top.world[,2:12],2,normalize))
corr_data <- norm_data
norm_data$country <- c("Argentina","Bangladesh","Brazil","Chile","Colombia","France","Germany","India","Iran","Italy","Mexico","Pakistan","Peru","Russia","saudi Arabia","South Africa","Spain","Turkey","United Kingdom","US")
#View(norm_data)


norm_data_plot <- select(norm_data,"country","confirm.rate","death.rate","recover.rate","healthcare","Pornhub","GDP","avg_temp")
norm_data_plot %<>% gather(key=type, value=count, -c(country))
level_order <- factor(norm_data_plot$type, 
                      level = c("GDP","avg_temp","healthcare","recover.rate","death.rate","confirm.rate","Pornhub"))
ggplot(data = norm_data_plot, aes(x=country, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )

NA
NA
#rank GDP
data.top.hight <- data.gdp %>% select(country, year,GDP) %>%
  mutate(ranking = dense_rank(desc(GDP)))
data.top.hight
k <- 15
top.gdp <- data.top.hight %>% 
  #filter(ranking <= k + 1) %>% 
  arrange(ranking)
top.gdp <- head(top.gdp,21)
data.top.low <- data.gdp %>% select(country, year,GDP) %>%
  mutate(ranking = dense_rank(GDP))
low.gdp.long <- data.top.low %>% 
  #filter(ranking <= k + 1) %>% 
  arrange(ranking)
View(low.gdp.long)
low.gdp <- head(low.gdp.long,23)
low.gdp
#correlation
corr_data %<>% select(c(GDP,confirm.rate,death.rate,recover.rate,healthcare,avg_temp,Pornhub))
head(corr_data)
cor(corr_data)
                    GDP confirm.rate  death.rate recover.rate  healthcare    avg_temp
GDP           1.0000000    0.4350394 -0.23717777   -0.4763494  0.25183938  0.15986445
confirm.rate  0.4350394    1.0000000 -0.34002060   -0.7254358  0.46302123 -0.38053574
death.rate   -0.2371778   -0.3400206  1.00000000    0.1832321 -0.16021245  0.31063014
recover.rate -0.4763494   -0.7254358  0.18323215    1.0000000 -0.73011486  0.12398936
healthcare    0.2518394    0.4630212 -0.16021245   -0.7301149  1.00000000 -0.05087304
avg_temp      0.1598645   -0.3805357  0.31063014    0.1239894 -0.05087304  1.00000000
Pornhub       0.6997223    0.4557707 -0.07822766   -0.4581646  0.35166155  0.10674647
                 Pornhub
GDP           0.69972226
confirm.rate  0.45577066
death.rate   -0.07822766
recover.rate -0.45816457
healthcare    0.35166155
avg_temp      0.10674647
Pornhub       1.00000000
ggcorrplot(cor(corr_data),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

df <- data.long %>% filter(country %in% top.countries) %<>%
  mutate(country=country %>% factor(levels=c(top.countries)))
df %>% filter(country != 'World' & type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count, fill=type)) +
  geom_area(alpha=0.5) +
# xlab('') + ylab('') +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries - ',
                    max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=12),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=12),
        strip.text.x=element_text(size=12),
        axis.text=element_text(size=12),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~country, ncol=4, scales='free_y')

p <- df %>% filter(country != 'World') %>%
  ggplot(aes(x=date, y=count, color=type)) +
  geom_line() +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries (log scale) - ',
                    max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=10),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=10),
        strip.text.x=element_text(size=10),
        axis.text=element_text(size=10),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
p + facet_wrap(~country, ncol=4, scales='free_y')

data.world %<>% arrange(desc(date)) %>%
  select(c(date, confirmed, deaths, recovered, current.confirmed,new.confirmed, new.deaths, new.recovered, rate.lower, rate.upper, rate.daily))
data.world %>%
  mutate(rate.upper = rate.upper %>% format(nsmall=1) %>% paste0('\\%'),
         rate.lower = rate.lower %>% format(nsmall=1) %>% paste0('\\%'),
         rate.daily = rate.daily %>% format(nsmall=1) %>% paste0('\\%')) 
#sars_2003
df_sars <- tbl(my_db, sql("select * from sars_2003_update"))
df_sars <- as.data.frame(df_sars)
df_sars
#datesSar <- as.Date(df_sars$Date,format = "%m/%d/%y")
#datesSar

df_sars %<>%  mutate(Date = as.Date(df_sars$Date,format = "%m/%d/%y"))
df_sars
## convert from wide to long format
dataSar.long <- df_sars %>%
  select(c(Date, country, Cumulative_number , Number_deaths, Number_recovered)) %>%
  gather(key=type, value=count, -c(country, Date))
## set factor levels to show them in a desirable order
dataSar.long %<>% mutate(type=recode_factor(type, Cumulative_number ='Cumulative Number',
                                         Number_deaths ='Number of deaths',
                                         Number_recovered ='Number of recovered'))
View(dataSar.long)
 
g <-
  ggplot(dataSar.long,aes(Date,count,color = type)) +
  geom_line()+
  geom_point()+
  xlab("")+
  ylab("")
g

NA
#Covid_Thailand
df_thai <- tbl(my_db, sql("select * from covid_Thailand"))
df_thai <- as.data.frame(df_thai)
View(df_thai)
#clean Covid_Thailand
dates.th <- df_thai[,2]%>% mdy()
range(dates.th)
[1] "2020-01-12" "2021-01-15"
min.date.th <- min(dates.th)
max.date.th <- max(dates.th)
min.date.txt.th <- min.date.th %>% format('%d %b %Y')
max.date.txt.th <- max.date.th %>% format('%d %b %Y')
df_thai$announce_date <- mdy(df_thai$announce_date)
df_thai$notification_date <- mdy(df_thai$notification_date)
df_thai 
df_thai <- df_thai %>% select(!No.) %>% select(!notification_date) %>% 
  group_by(announce_date)
df_thai
# Total confirmed cases in Thailand
data.thai.count <- df_thai %>%
  select(announce_date) %>%
  summarise(comfirmed = n())  %>% as.data.frame()
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.count$cumulative_confirmed <- cumsum(data.thai.count[, 2])
data.thai.count
## Thai Confirmed Cases (Jan 2020 - Jan 2021
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)

## Thai Confirmed Cases (Jan 2020 - Jan 2021) log scale
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)

# Confirmed cases divided by sex (gender)
data.thai.gender <- df_thai %>%
  group_by(sex) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>1)%>%
  #mutate(pos = cumsum(percent) - 0.5*percent) %>%
  arrange(desc(percent))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.gender
data.thai.gender$sex <- factor(data.thai.gender$sex, levels = as.character(data.thai.gender$sex))
data.thai.gender$sex
[1] Male   Female
Levels: Male Female
g.th.gender <- data.thai.gender %>% 
  ggplot(aes(x = "", y = percent, fill = sex)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  theme_void() +
  labs(title='Gender of Thai Confirmed Cases (Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "white", size = 5, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.gender

# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.risk
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>3.5) %>%
  arrange(desc(percent)) 
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.risk
data.thai.risk$risk[data.thai.risk$risk == "C"] <- "Close contact with the patient"
data.thai.risk$risk[data.thai.risk$risk == "F"] <- "State Quarantine"
data.thai.risk$risk[data.thai.risk$risk == "O"] <- "AOQ/ALQ/HQ/AHQ/OQ"
data.thai.risk$risk[data.thai.risk$risk == "G"] <- "Go to a crowded place"
data.thai.risk$risk[data.thai.risk$risk == "B"] <- "Thai people from abroad"
data.thai.risk$risk[data.thai.risk$risk == "D"] <- "Career at risk"
data.thai.risk$risk[data.thai.risk$risk == "H"] <- "Cabaret"
data.thai.risk
data.thai.risk$risk <- factor(data.thai.risk$risk, levels = as.character(data.thai.risk$risk))
data.thai.risk$risk
 [1] Close contact with the patient Cluster Samut Sakhon           State Quarantine               Unknown                       
 [5] AOQ/ALQ/HQ/AHQ/OQ              Go to a crowded place          Thai people from abroad        Cluster Rayong                
 [9] Career at risk                 Cabaret                       
10 Levels: Close contact with the patient Cluster Samut Sakhon State Quarantine Unknown AOQ/ALQ/HQ/AHQ/OQ ... Cabaret
g.th.risk <- data.thai.risk %>% 
  ggplot(aes(x = "", y = percent, fill = risk)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_polar("y") +
  theme_void() +
  labs(title='Risk of Thai Confirmed Cases(Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "Black", size = 3, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.risk

# Confirmed cases divided by age
data.thai.age <- df_thai %>%
  group_by(age,sex) %>% 
  filter(sex != "")%>%
  summarise(count = n()) %>%
  arrange(desc(count))
`summarise()` regrouping output by 'age' (override with `.groups` argument)
data.thai.age
ggplot(data.thai.age,aes(x=age,y=count,fill=sex))+geom_bar(stat = "identity")+
  labs(title='Age of Thai Confirmed Cases')+guides(fill=guide_legend(reverse = T))

# Confirmed cases divided by nationality

data.thai.nationality <- df_thai %>%
  group_by(nationality) %>%
  summarise(count = n()) %>%
  filter(count > 11)%>%
  arrange(desc(count))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.nationality$nationality[data.thai.nationality$nationality == "????????"] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == ""] <- "Unknown"
data.thai.nationality
ggplot(data.thai.nationality,aes(x=nationality,y=count))+geom_bar(stat = "identity")+
  labs(title='Nationality of Thai Confirmed Cases')+coord_flip()

#Covid_US
df_us <- tbl(my_db, sql("select * from covidUs"))
df_us <- as.data.frame(df_us)
df_us
#clean Covid_US
dates.us <- df_us[,2]%>% mdy()
range(dates.us)
[1] "2020-01-21" "2020-12-24"
min.date.us <- min(dates.us)
max.date.us <- max(dates.us)
min.date.txt.us <- min.date.us %>% format('%d %b %Y')
max.date.txt.us <- max.date.us %>% format('%d %b %Y')
df_us$date <- mdy(df_us$date)
df_us
df_us <- df_us %>% select(!MyUnknownColumn) %>% select(!fips) %>% 
  group_by(date)
df_us
## convert from wide to long format
data.long.us <- df_us %>%
  select(c(date, state, cases, deaths)) %>%
  gather(key=type, value=count, -c(date, state))
## set factor levels to show them in a desirable order
data.long.us %<>% mutate(type=recode_factor(type, cases='Confirmed',
                                         deaths='Deaths'))

View(data.long.us)
ggplot(data.long.us,aes(x=date,y=count))+
  geom_line(aes(color=type))

#gender in us
df_gender_us <- tbl(my_db, sql("select * from data_gender"))
df_gender_us <- as.data.frame(df_gender_us)
df_gender_us <- select(df_gender_us,c("State","Male","Female"))
df_gender_us <- rename(df_gender_us,"state"="State")
df_gender_us
#population in us
df_pop_us <- tbl(my_db, sql("select * from data_population"))
df_pop_us <- as.data.frame(df_pop_us)
df_pop_us <- select(df_pop_us,c("State","Population"))
df_pop_us <- rename(df_pop_us,"state"="State")
df_pop_us
#lockdown in us
df_lockdown_us <- tbl(my_db, sql("select * from data_lockdown"))
df_lockdown_us <- as.data.frame(df_lockdown_us)
df_lockdown_us <- select(df_lockdown_us,c("State","Day lockdown"))
df_lockdown_us <- rename(df_lockdown_us,"state"="State")
df_lockdown_us
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df_Allus <- merge(x = df_us, y = df_gender_us, by = "state", all.x = TRUE) 

df_Allus <- merge(x = df_Allus, y = df_pop_us, by = "state", all.x = TRUE) 

df_Allus <- merge(x = df_Allus, y = df_lockdown_us, by = "state", all.x = TRUE) 

View(df_Allus)
index <- is.na(df_Allus)
df_Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df_Allus[,3:8],2,normalize))
corr_dataUS <- Allus 

View(Allus)
Allus$state <- c(df_Allus$state)
View(Allus)


Allus_plot <- select(Allus,"state","cases","deaths","Male","Female","Population","Day lockdown")
Allus_plot %<>% gather(key=type, value=count, -c(state))
level_order <- factor(Allus_plot$type, 
                      level = c("cases","deaths","Male","Female","Population","Day lockdown"))
ggplot(data = Allus_plot, aes(x=state, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )

NA
NA
NA
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown))
head(corr_dataUS)
cor(corr_dataUS)
                cases    deaths      Male    Female Population Daylockdown
cases       1.0000000 0.8771162 0.1273080 0.1904992  0.6171785   0.2038334
deaths      0.8771162 1.0000000 0.1147696 0.1847547  0.6601563   0.2104893
Male        0.1273080 0.1147696 1.0000000 0.9513769  0.1956974   0.5064869
Female      0.1904992 0.1847547 0.9513769 1.0000000  0.2679789   0.5403218
Population  0.6171785 0.6601563 0.1956974 0.2679789  1.0000000   0.3394907
Daylockdown 0.2038334 0.2104893 0.5064869 0.5403218  0.3394907   1.0000000
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

---
title: "R Notebook"
output: html_notebook
---
```{r}
#install.packages("knitr")
#install.packages("grid")
install.packages("plotly")
#install.packages("dprep")
#install.packages("normalr")
#install.packages("ggcorrplot")

#install.packages("RColorBrewer")
#install.packages("rgdal")
#install.packages("jsonlite")
#install.packages("readr")
#install.packages("readr")
```

```{r}


library(gridExtra)
library(dplyr)
library(lubridate)
library(magrittr)
library(ggplot2)
library(tidyr)
library(knitr)
library(normalr)
library(ggcorrplot)

library(leaflet)
library(plotly)
library(RColorBrewer)
library(readr)
#library(MLRMPA)
#??src_mysql
my_db <- src_mysql(
  dbname = "coronavirus",
  host = "localhost",
  user = "root",
  password = "1234"
)
my_db

##import data
df_conf <- tbl(my_db, sql("select * from covid19_confirmed "))
df_conf <- as.data.frame(df_conf)
df_conf
df_deaths <- tbl(my_db, sql("select * from covid19_deaths "))
df_deaths <- as.data.frame(df_deaths)
df_deaths
df_recover <- tbl(my_db, sql("select * from covid19_recovered "))
df_recover <- as.data.frame(df_recover)
df_recover
```
```{r}
##check the time frame of the data
n.col <- ncol(df_conf)
dates <- names(df_conf)[5:n.col]%>% mdy()
range(dates)
min.date <- min(dates)
max.date <- max(dates)
min.date.txt <- min.date %>% format('%d %b %Y')
max.date.txt <- max.date %>% format('%d %b %Y')
```
```{r}
#clean data
cleanData <- function(data) {
  ## remove some columns
  data %<>% select(-c(Province.State, Lat, Long)) %>% rename(country=Country.Region)
  ## convert from wide to long format
  data %<>% gather(key=date, value=count, -country)
  ## convert from character to date
  data %<>% mutate(date = date %>% mdy())
  ## aggregate by country
  data %<>% group_by(country, date) %>% summarise(count=sum(count, na.rm=T)) %>% as.data.frame()
  return(data)
}
## clean the three data sets
data.confirmed <- df_conf %>% cleanData() %>% rename(confirmed=count)
data.deaths <- df_deaths %>% cleanData() %>% rename(deaths=count)
data.recovered <- df_recover %>% cleanData() %>% rename(recovered=count)
data <- data.confirmed %>% merge(data.deaths, all=T) %>% merge(data.recovered, all=T)
data
## countries/regions with confirmed cases, excl. cruise ships
countries <- data %>% pull(country) %>% setdiff('Cruise Ship')
data 
```


```{r}
data.world <- data %>% group_by(date) %>%
  summarise(country='World',
            confirmed = sum(confirmed, na.rm=T),
            deaths = sum(deaths, na.rm=T),
            recovered = sum(recovered, na.rm=T))
data %<>% rbind(data.world)
data
data %<>% mutate(current.confirmed = confirmed - deaths - recovered)
data

```
```{r}
#rate
data %<>% arrange(country, date)
n <- nrow(data)
day1 <- min(data$date)
data %<>% mutate(new.confirmed = ifelse(date == day1, NA, confirmed - lag(confirmed, n=1)),
                 new.deaths = ifelse(date == day1, NA, deaths - lag(deaths, n=1)),
                 new.recovered = ifelse(date == day1, NA, recovered - lag(recovered, n=1)))
data %<>% mutate(new.confirmed = ifelse(new.confirmed < 0, 0, new.confirmed),
                 new.deaths = ifelse(new.deaths < 0, 0, new.deaths),
                 new.recovered = ifelse(new.recovered < 0, 0, new.recovered))
## death rate based on total deaths and recovered cases
data %<>% mutate(rate.upper = (100 * deaths / (deaths + recovered)) %>% round(1))
## lower bound: death rate based on total confirmed cases
data %<>% mutate(rate.lower = (100 * deaths / confirmed) %>% round(1))
## death rate based on the number of death/recovered on every single day
data %<>% mutate(rate.daily = (100 * new.deaths / (new.deaths + new.recovered)) %>% round(1))
View(data)
```
```{r}
## convert from wide to long format
data.long <- data %>%
  select(c(country, date, confirmed, current.confirmed, recovered, deaths)) %>%
  gather(key=type, value=count, -c(country, date))
## set factor levels to show them in a desirable order
data.long %<>% mutate(type=recode_factor(type, confirmed='Total Confirmed',
                                         current.confirmed='Current Confirmed',
                                         recovered='Recovered',
                                         deaths='Deaths'))
View(data.long)
```
```{r}
##Number of case World
world <- filter(data.long,country == 'World')
plot1 <- world %>% filter(type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count)) +
  geom_area(aes(fill=type), alpha=0.5) +
  labs(title=paste0('Numbers of Cases Worldwide - ', max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=7),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=6),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1))
plot2 <- world %>%
  ggplot(aes(x=date, y=count)) +
  geom_line(aes(color=type)) +
  labs(title=paste0('Numbers of Cases Worldwide (log scale) - ', max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=14),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=14),
        axis.text=element_text(size=14),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)
```


```{r}
plot2
```
```{r}
## Current Confirmed Cases
data.world <- data %>% filter(country=='World')
n <- nrow(data.world)
plot1 <- ggplot(data.world, aes(x=date, y=current.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Current Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=new.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Daily New Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)
View(data.world)
```
```{r}
## a scatter plot with a smoothed line and vertical x-axis labels
plot1 <- ggplot(data.world, aes(x=date, y=deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot3 <- ggplot(data.world, aes(x=date, y=new.deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot4 <- ggplot(data.world, aes(x=date, y=new.recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show four plots together, with 2 plots in each row
grid.arrange(plot1, plot2, plot3, plot4, nrow=2)
```



```{r}
## convert from wide to long format, for drawing area plots
rates.long <- data %>%
  select(c(country, date, rate.upper, rate.lower, rate.daily)) %>%
  gather(key=type, value=count, -c(country, date))
# set factor levels to show them in a desirable order
rates.long %<>% mutate(type=recode_factor(type, rate.daily='Daily',
                             rate.upper='Upper bound'))
```
```{r}
## ranking by confirmed cases
data.latest.all <- data %>% filter(date == max(date)) %>%
  select(country, date,confirmed, new.confirmed, current.confirmed,
         recovered, deaths, new.deaths, death.rate=rate.lower) %>%
  mutate(ranking = dense_rank(desc(confirmed)))
#View(data.latest.all)
k <- 20
## top 20 countries: 21 incl. 'World'
top.countries <- data.latest.all %>% filter(ranking <= k + 1) %>%
  arrange(ranking) %>% pull(country) %>% as.character()
top.countries %>% setdiff('World') %>% print()

```

```{r}
data.latest <- data.latest.all %>% filter(!is.na(country)) %>%
  mutate(country=ifelse(ranking <= k + 1, as.character(country), 'Others')) %>%
  mutate(country=country %>% factor(levels=c(top.countries, 'Others')))
data.latest %<>% group_by(country) %>%
  summarise(confirmed=sum(confirmed), new.confirmed=sum(new.confirmed),
            current.confirmed=sum(current.confirmed),
            recovered=sum(recovered), deaths=sum(deaths), new.deaths=sum(new.deaths)) %>%
  mutate(death.rate=(100 * deaths/confirmed) %>% round(1)) 
data.latest
data.latest %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths, current.confirmed,recovered)) %>%
  mutate(recover.rate=(100 * recovered/confirmed) %>% round(1))
data.latest
df_pop <- tbl(my_db, sql("select * from population "))
df_pop <- as.data.frame(df_pop)
df_pop <- rename(df_pop,"country"="Country")
df_pop
data.latest <- merge(x = data.latest, y = df_pop, by = "country", all.x = TRUE) 
data.latest <- rename(data.latest,"population" = "Population (2020)")
data.latest
data.latest  %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths,
                          current.confirmed,recovered,recover.rate,population)) %>%
  mutate(confirm.rate=(100 *confirmed/population) %>% round(1))
data.latest

```
```{r}
data.latest %>% mutate(death.rate=death.rate %>% format(nsmall=1) %>% paste0('%'))


```

```{r}
## convert from wide to long format, for drawing area plots
data.latest.long <- data.latest %>% filter(country!='World') %>%
  gather(key=type, value=count, -country)
## set factor levels to show them with proper text and in a desirable order
data.latest.long %<>% mutate(type=recode_factor(type,
                                                confirmed='Total Confirmed',
                                                deaths='Total Deaths',
                                                death.rate='Death Rate (%)',
                                                new.confirmed='New Confirmed (compared with one day before)',
                                                new.deaths='New Deaths (compared with one day before)',
                                                current.confirmed='Current Confirmed',
                                                recover.rate = 'Recover Rate(%)',
                                                confirm.rate = 'Confirmed Rate(%)'))
#View(data.latest.long)
data.one.dem <- filter(data.latest.long,type=='Total Confirmed'
                       | type=='Total Deaths'
                       | type=='Current Confirmed')
data.two.dem <- filter(data.latest.long,type=='Death Rate (%)'
                       | type=='New Confirmed (compared with one day before)'
                       | type=='New Deaths (compared with one day before)'
                       | type=='Recover Rate(%)'
                       | type=='Confirmed Rate(%)')
data.two.dem
```

```{r}
## bar chart
data.one.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')

```
```{r}

data.two.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')
```

```{r}
##GDP
df_gdp <- tbl(my_db, sql("select * from gdp"))
df_gdp <- as.data.frame(df_gdp)
df_gdp <- rename(df_gdp,"country"="Real GDP growth (Annual percent change)")
df_gdp <- select(df_gdp,c("country","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021"))
df_gdp
df_gdp2019 <- tbl(my_db, sql("select * from gdp19"))
df_gdp2019 <- as.data.frame(df_gdp2019)
df_gdp2019

```
```{r}
#healthranking
df_healt <- tbl(my_db, sql("select * from healthranking"))
df_healt <- as.data.frame(df_healt)
df_healt <- select(df_healt,c("country","healthCareIndex"))
df_healt
```
```{r}
#Top20Pornhub
df_pornhub <- tbl(my_db, sql("select * from Pornhub"))
df_pornhub <- as.data.frame(df_pornhub)
df_pornhub

```


```{r}
#temp
df_temp <- tbl(my_db, sql("select * from Avg_World_Temp_2020"))
df_temp <- as.data.frame(df_temp)
df_city <- select(df_temp,c("Country","City")) %>%
  rename(country=Country) %>% 
  rename(city=City)
numofcity <- aggregate(city ~ country, data = df_city, length)
df_temp <- select(df_temp,c("Country","Apr","May","Jun","Jul","Aug")) %>%
  rename(country=Country)
df_temp <- data.frame(country=df_temp[,1],avg=rowMeans(df_temp[,-1]))
df_temp <- df_temp %<>% group_by(country) %>% summarise(avg_temp = mean(avg,na.rm = TRUE))
df_temp <- df_temp %>% mutate(country=ifelse(country=="United States","US", country ) ) 
df_temp$avg_temp <- df_temp$avg_temp %>% 
  sprintf(df_temp$avg_temp, fmt = '%#.1f') %>%
  as.numeric(df_temp$avg_temp)
df_temp
```



```{r}
#Top 20 with gdp
data.longGDP <- df_gdp %>% gather(key=year, value=GDP, -c(country))
data.top <- data.latest %>% filter(country!='World')
data.top <- head(data.top,20)
#View(data.top)
data.gdp <- filter(data.longGDP,year=='2020')
#View(data.gdp)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "country", all.x = TRUE) 
  return(data)
}
data.top.world <- merge(x = data.top, y = df_gdp2019, by = "country", all.x = TRUE) %>% 
  select(-c(code,rank,new.confirmed,new.deaths,current.confirmed,population)) %>% 
  rename(GDP="GDP (millions of US dollars)")

data.top.world <- merge(x = data.top.world, y = df_healt, by = "country", all.x = TRUE) %>%
  rename(healthcare="healthCareIndex")
#data.top.world <- mergcountry(data.top.world, df_temp)

data.top.world <- merge(x = data.top.world, y = df_pornhub, by = "country", all.x = TRUE) %>%
  rename(Pornhub = "PornhubIndex(%)")

data.top.world <- mergcountry(data.top.world, df_temp)
index <- is.na(data.top.world)
data.top.world[index] <- 0
data.top.world
#View(data.top.world)

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
norm_data = as.data.frame(apply(data.top.world[,2:12],2,normalize))
corr_data <- norm_data
norm_data$country <- c("Argentina","Bangladesh","Brazil","Chile","Colombia","France","Germany","India","Iran","Italy","Mexico","Pakistan","Peru","Russia","saudi Arabia","South Africa","Spain","Turkey","United Kingdom","US")
#View(norm_data)


norm_data_plot <- select(norm_data,"country","confirm.rate","death.rate","recover.rate","healthcare","Pornhub","GDP","avg_temp")
norm_data_plot %<>% gather(key=type, value=count, -c(country))
level_order <- factor(norm_data_plot$type, 
                      level = c("GDP","avg_temp","healthcare","recover.rate","death.rate","confirm.rate","Pornhub"))
ggplot(data = norm_data_plot, aes(x=country, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )

  
```


```{r}
#rank GDP
data.top.hight <- data.gdp %>% select(country, year,GDP) %>%
  mutate(ranking = dense_rank(desc(GDP)))
data.top.hight
k <- 15
top.gdp <- data.top.hight %>% 
  #filter(ranking <= k + 1) %>% 
  arrange(ranking)
top.gdp <- head(top.gdp,21)
data.top.low <- data.gdp %>% select(country, year,GDP) %>%
  mutate(ranking = dense_rank(GDP))
low.gdp.long <- data.top.low %>% 
  #filter(ranking <= k + 1) %>% 
  arrange(ranking)
View(low.gdp.long)
low.gdp <- head(low.gdp.long,23)
low.gdp
```



```{r}
#correlation
corr_data %<>% select(c(GDP,confirm.rate,death.rate,recover.rate,healthcare,avg_temp,Pornhub))
head(corr_data)
cor(corr_data)
ggcorrplot(cor(corr_data),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)
```
```{r}
df <- data.long %>% filter(country %in% top.countries) %<>%
  mutate(country=country %>% factor(levels=c(top.countries)))
df %>% filter(country != 'World' & type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count, fill=type)) +
  geom_area(alpha=0.5) +
# xlab('') + ylab('') +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries - ',
                    max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=12),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=12),
        strip.text.x=element_text(size=12),
        axis.text=element_text(size=12),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~country, ncol=4, scales='free_y')

```
```{r}
p <- df %>% filter(country != 'World') %>%
  ggplot(aes(x=date, y=count, color=type)) +
  geom_line() +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries (log scale) - ',
                    max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=10),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=10),
        strip.text.x=element_text(size=10),
        axis.text=element_text(size=10),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
p + facet_wrap(~country, ncol=4, scales='free_y')
```
```{r}
data.world %<>% arrange(desc(date)) %>%
  select(c(date, confirmed, deaths, recovered, current.confirmed,new.confirmed, new.deaths, new.recovered, rate.lower, rate.upper, rate.daily))
data.world %>%
  mutate(rate.upper = rate.upper %>% format(nsmall=1) %>% paste0('\\%'),
         rate.lower = rate.lower %>% format(nsmall=1) %>% paste0('\\%'),
         rate.daily = rate.daily %>% format(nsmall=1) %>% paste0('\\%')) 
```
```{r}
#sars_2003
df_sars <- tbl(my_db, sql("select * from sars_2003_update"))
df_sars <- as.data.frame(df_sars)
df_sars
```


```{r}
## convert from character to date
#datesSar <- as.Date(df_sars$Date,format = "%m/%d/%y")

df_sars %<>%  mutate(Date = as.Date(df_sars$Date,format = "%m/%d/%y"))
df_sars
```

```{r}
## convert from wide to long format
dataSar.long <- df_sars %>%
  select(c(Date, country, Cumulative_number , Number_deaths, Number_recovered)) %>%
  gather(key=type, value=count, -c(country, Date))
## set factor levels to show them in a desirable order
dataSar.long %<>% mutate(type=recode_factor(type, Cumulative_number ='Cumulative Number',
                                         Number_deaths ='Number of deaths',
                                         Number_recovered ='Number of recovered'))
View(dataSar.long)
 
```




```{r}
g <-
  ggplot(dataSar.long,aes(Date,count,color = type)) +
  geom_line()+
  geom_point()+
  xlab("")+
  ylab("")
g
  
```
```{r}
#Covid_Thailand
df_thai <- tbl(my_db, sql("select * from covid_Thailand"))
df_thai <- as.data.frame(df_thai)
View(df_thai)
```

```{r}
#clean Covid_Thailand
dates.th <- df_thai[,2]%>% mdy()
range(dates.th)
min.date.th <- min(dates.th)
max.date.th <- max(dates.th)
min.date.txt.th <- min.date.th %>% format('%d %b %Y')
max.date.txt.th <- max.date.th %>% format('%d %b %Y')
```
```{r}
df_thai$announce_date <- mdy(df_thai$announce_date)
df_thai$notification_date <- mdy(df_thai$notification_date)
df_thai 
```
```{r}
df_thai <- df_thai %>% select(!No.) %>% select(!notification_date) %>% 
  group_by(announce_date)
df_thai
```
```{r}
# Total confirmed cases in Thailand
data.thai.count <- df_thai %>%
  select(announce_date) %>%
  summarise(comfirmed = n())  %>% as.data.frame()
data.thai.count$cumulative_confirmed <- cumsum(data.thai.count[, 2])
data.thai.count
```


```{r}
## Thai Confirmed Cases (Jan 2020 - Jan 2021
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)
```
```{r}
## Thai Confirmed Cases (Jan 2020 - Jan 2021) log scale
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)
```

```{r}
# Confirmed cases divided by sex (gender)
data.thai.gender <- df_thai %>%
  group_by(sex) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>1)%>%
  #mutate(pos = cumsum(percent) - 0.5*percent) %>%
  arrange(desc(percent))
data.thai.gender
```
```{r}
data.thai.gender$sex <- factor(data.thai.gender$sex, levels = as.character(data.thai.gender$sex))
data.thai.gender$sex
g.th.gender <- data.thai.gender %>% 
  ggplot(aes(x = "", y = percent, fill = sex)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  theme_void() +
  labs(title='Gender of Thai Confirmed Cases (Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "white", size = 5, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.gender
```

```{r}
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
data.thai.risk
```
```{r}
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>3.5) %>%
  arrange(desc(percent)) 
data.thai.risk
```
```{r}
data.thai.risk$risk[data.thai.risk$risk == "C"] <- "Close contact with the patient"
data.thai.risk$risk[data.thai.risk$risk == "F"] <- "State Quarantine"
data.thai.risk$risk[data.thai.risk$risk == "O"] <- "AOQ/ALQ/HQ/AHQ/OQ"
data.thai.risk$risk[data.thai.risk$risk == "G"] <- "Go to a crowded place"
data.thai.risk$risk[data.thai.risk$risk == "B"] <- "Thai people from abroad"
data.thai.risk$risk[data.thai.risk$risk == "D"] <- "Career at risk"
data.thai.risk$risk[data.thai.risk$risk == "H"] <- "Cabaret"
data.thai.risk
```


```{r}
data.thai.risk$risk <- factor(data.thai.risk$risk, levels = as.character(data.thai.risk$risk))
data.thai.risk$risk
g.th.risk <- data.thai.risk %>% 
  ggplot(aes(x = "", y = percent, fill = risk)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_polar("y") +
  theme_void() +
  labs(title='Risk of Thai Confirmed Cases(Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "Black", size = 3, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.risk
```



```{r}
# Confirmed cases divided by age
data.thai.age <- df_thai %>%
  group_by(age,sex) %>% 
  filter(sex != "")%>%
  summarise(count = n()) %>%
  arrange(desc(count))
data.thai.age
```
```{r}
ggplot(data.thai.age,aes(x=age,y=count,fill=sex))+geom_bar(stat = "identity")+
  labs(title='Age of Thai Confirmed Cases')+guides(fill=guide_legend(reverse = T))

```

```{r}
# Confirmed cases divided by nationality

data.thai.nationality <- df_thai %>%
  group_by(nationality) %>%
  summarise(count = n()) %>%
  filter(count > 11)%>%
  arrange(desc(count))
data.thai.nationality$nationality[data.thai.nationality$nationality == "????????"] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == ""] <- "Unknown"
data.thai.nationality
ggplot(data.thai.nationality,aes(x=nationality,y=count))+geom_bar(stat = "identity")+
  labs(title='Nationality of Thai Confirmed Cases')+coord_flip()
```
```{r}
#Covid_US
df_us <- tbl(my_db, sql("select * from covidUs"))
df_us <- as.data.frame(df_us)
df_us
```
```{r}
#clean Covid_US
dates.us <- df_us[,2]%>% mdy()
range(dates.us)
min.date.us <- min(dates.us)
max.date.us <- max(dates.us)
min.date.txt.us <- min.date.us %>% format('%d %b %Y')
max.date.txt.us <- max.date.us %>% format('%d %b %Y')
```

```{r}
df_us$date <- mdy(df_us$date)
df_us
```


```{r}
df_us <- df_us %>% select(!MyUnknownColumn) %>% select(!fips) %>% 
  group_by(date)
df_us
```


```{r}
## convert from wide to long format
data.long.us <- df_us %>%
  select(c(date, state, cases, deaths)) %>%
  gather(key=type, value=count, -c(date, state))
## set factor levels to show them in a desirable order
data.long.us %<>% mutate(type=recode_factor(type, cases='Confirmed',
                                         deaths='Deaths'))

View(data.long.us)
```


```{r}
ggplot(data.long.us,aes(x=date,y=count))+
  geom_line(aes(color=type))

```
```{r}
#gender in us
df_gender_us <- tbl(my_db, sql("select * from data_gender"))
df_gender_us <- as.data.frame(df_gender_us)
df_gender_us <- select(df_gender_us,c("State","Male","Female"))
df_gender_us <- rename(df_gender_us,"state"="State")
df_gender_us
```
```{r}
#population in us
df_pop_us <- tbl(my_db, sql("select * from data_population"))
df_pop_us <- as.data.frame(df_pop_us)
df_pop_us <- select(df_pop_us,c("State","Population"))
df_pop_us <- rename(df_pop_us,"state"="State")
df_pop_us
```
```{r}
#lockdown in us
df_lockdown_us <- tbl(my_db, sql("select * from data_lockdown"))
df_lockdown_us <- as.data.frame(df_lockdown_us)
df_lockdown_us <- select(df_lockdown_us,c("State","Day lockdown"))
df_lockdown_us <- rename(df_lockdown_us,"state"="State")
df_lockdown_us
```
```{r}
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df_Allus <- merge(x = df_us, y = df_gender_us, by = "state", all.x = TRUE) 

df_Allus <- merge(x = df_Allus, y = df_pop_us, by = "state", all.x = TRUE) 

df_Allus <- merge(x = df_Allus, y = df_lockdown_us, by = "state", all.x = TRUE) 

View(df_Allus)

```




```{r}
index <- is.na(df_Allus)
df_Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df_Allus[,3:8],2,normalize))
corr_dataUS <- Allus 

View(Allus)

```

```{r}
Allus$state <- c(df_Allus$state)
View(Allus)
```


```{r}


Allus_plot <- select(Allus,"state","cases","deaths","Male","Female","Population","Day lockdown")
Allus_plot %<>% gather(key=type, value=count, -c(state))
level_order <- factor(Allus_plot$type, 
                      level = c("cases","deaths","Male","Female","Population","Day lockdown"))
ggplot(data = Allus_plot, aes(x=state, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )



```
```{r}
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
```


```{r}
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown))
head(corr_dataUS)
cor(corr_dataUS)
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

```




















